In [84]:
# KNN, or K-Nearest Neighbor Algorithm
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
# Load data
os.chdir("C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data") # Change the working directory
Auto = pd.read_csv("Auto.csv") # Read the data file in the CSV format
Auto['horsepower'] = pd.to_numeric(Auto['horsepower'], errors='coerce')  # non-numeric entries become NaN
In [85]:
# Summary of mpg
print(Auto.describe())
              mpg   cylinders  displacement  horsepower       weight  \
count  397.000000  397.000000    397.000000  392.000000   397.000000
mean    23.515869    5.458438    193.532746  104.469388  2970.261965
std      7.825804    1.701577    104.379583   38.491160   847.904119
min      9.000000    3.000000     68.000000   46.000000  1613.000000
25%     17.500000    4.000000    104.000000   75.000000  2223.000000
50%     23.000000    4.000000    146.000000   93.500000  2800.000000
75%     29.000000    8.000000    262.000000  126.000000  3609.000000
max     46.600000    8.000000    455.000000  230.000000  5140.000000

       acceleration        year      origin
count    397.000000  397.000000  397.000000
mean      15.555668   75.994962    1.574307
std        2.749995    3.690005    0.802549
min        8.000000   70.000000    1.000000
25%       13.800000   73.000000    1.000000
50%       15.500000   76.000000    1.000000
75%       17.100000   79.000000    2.000000
max       24.800000   82.000000    3.000000
In [86]:
# A few horsepower values are NaN; we'll drop those cars.
Auto.dropna(inplace=True)
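In [ ]:
# Sanity check (a small sketch, not in the original notebook): confirm the
# rows with missing horsepower are gone. For the standard ISLR Auto.csv this
# should leave 392 of the 397 cars.
print(Auto.shape)
print(Auto.isna().sum().sum())  # expect 0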
In [87]:
# Initiate a fuel consumption rating variable.
# 'Economy', the fuel consumption rating, will be defined
# as a categorical variable, based on miles per gallon
Auto['Economy'] = pd.cut(Auto['mpg'],
                         bins=[-np.inf, 17, 23, 29, np.inf],
                         labels=['Heavy', 'OK', 'Eco', 'Excellent'])
print(Auto['Economy'].value_counts()) # Group counts
Economy
OK           106
Heavy         99
Excellent     95
Eco           92
Name: count, dtype: int64
In [ ]:
# The bin edges are approximately the sample quartiles of mpg, which is why
# we got four groups of roughly equal size.
# Now we'll derive a classification rule from the other car characteristics.
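In [ ]:
# The same four groups could also be produced directly from the sample
# quartiles with pd.qcut; a minimal sketch (equivalent up to the rounded
# bin edges used above):
Economy_q = pd.qcut(Auto['mpg'], q=4, labels=['Heavy', 'OK', 'Eco', 'Excellent'])
print(Economy_q.value_counts())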
In [136]:
# Prepare training and testing data, predictors (X) and responses (Y)
X = Auto.iloc[:, 1:7] # columns from cylinders to year
Y = Auto['Economy']
# KNN needs training predictors and labels to fit, test predictors to classify,
# and the number of neighbors K. We'll use Y_test for performance evaluation and tuning.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=42) # Split data at random
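In [ ]:
# KNN is distance-based, so predictors on large scales (e.g. weight in pounds)
# dominate the Euclidean distance. A sketch of how the features could be
# standardized first (an optional step, not used in the runs below):
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)   # fit the scaler on training data only
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)    # apply the same transform to test data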
In [94]:
# KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
knn_result = knn.predict(X_test)
In [96]:
# Confusion matrix and accuracy
conf_matrix = confusion_matrix(Y_test, knn_result)
print(conf_matrix)
[[23 26  0  2]
 [ 5 31  0  4]
 [ 0  0 39  9]
 [23  5  4 25]]
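In [ ]:
# A labeled view of the same confusion matrix (a sketch; the row/column order
# follows knn.classes_, i.e. the category values sorted alphabetically):
labels = knn.classes_
print(pd.DataFrame(conf_matrix, index=labels, columns=labels))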
In [98]:
accuracy = accuracy_score(Y_test, knn_result)
print(f'Accuracy: {accuracy}')
Accuracy: 0.6020408163265306
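In [ ]:
# Per-class precision and recall (a quick sketch, not part of the original
# output) show which Economy ratings the classifier confuses most often:
from sklearn.metrics import classification_report
print(classification_report(Y_test, knn_result))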
In [138]:
# 60.2% correct classification rate with K=3. Is there a better K? Check accuracy for K from 1 to 19.
K = list(range(1, 20))
class_rate = []
for k in K:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    knn_result = knn.predict(X_test)
    accuracy = accuracy_score(Y_test, knn_result)
    class_rate.append(accuracy)
plt.figure()
plt.plot(K, class_rate)
plt.xlabel('K')
plt.ylabel('Classification rate')
plt.title('Tuning of the KNN algorithm')
plt.show()
In [140]:
tuning_result = pd.DataFrame({'K':K, 'class_rate':class_rate})
In [142]:
print(tuning_result)
     K  class_rate
0    1    0.612245
1    2    0.607143
2    3    0.602041
3    4    0.622449
4    5    0.596939
5    6    0.596939
6    7    0.596939
7    8    0.596939
8    9    0.591837
9   10    0.607143
10  11    0.581633
11  12    0.591837
12  13    0.586735
13  14    0.586735
14  15    0.586735
15  16    0.586735
16  17    0.602041
17  18    0.612245
18  19    0.607143
In [ ]:
# K=4 provides the best classification rate on this split, 62.2%.
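In [ ]:
# The best K can also be read off programmatically rather than from the plot
# (a one-line sketch using the tuning table above):
print(tuning_result.loc[tuning_result['class_rate'].idxmax()])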